# Cross Entropy Method

## CartPole Example

In [1]:
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    The final layer has no activation, so the forward pass returns one raw
    score per action for every input row.

    Args:
        obs_size: number of input features per observation.
        hidden_size: width of the single hidden layer.
        n_actions: number of output scores (one per action).
    """

    def __init__(self, obs_size: int, hidden_size: int, n_actions: int) -> None:
        super().__init__()

        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as the module is applied.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the layer stack to ``x`` and return the per-action scores."""
        scores = self.net(x)
        return scores
    

In [2]:
import gymnasium as gym
from gymnasium.wrappers import NumpyToTorch
import torch.optim as optim

from examples.cross_entropy import save_net_spec
from velora.utils import load_config

# CartPole setup: load the experiment settings, then build the environment,
# network, loss and optimizer that the CartPole training cell consumes.
# NOTE(review): `config`, `obs_size`, `n_actions`, `loss` and `optimizer` are
# rebound by the FrozenLake setup cell further down, so this cell must be
# re-run before re-running CartPole training after that cell has executed.
config = load_config("config/cp_ce.yaml")
# Create the environment named in the config and wrap it with Gymnasium's
# NumpyToTorch wrapper.
env: gym.Env = NumpyToTorch(gym.make(config.env.name))

# Network input width is the first dimension of the observation shape;
# output width is the number of discrete actions.
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

# Hidden width and the optimizer keyword arguments both come from the config.
net = SimpleNet(obs_size, config.model.hidden_size, n_actions)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), **config.optimizer)

# Presumably writes a description of `net` under "saved/" -- TODO confirm
# against examples.cross_entropy.save_net_spec.
save_net_spec(net, "saved/")

In [6]:
import wandb

# Log in to Weights & Biases; the training cells' outputs include wandb run
# summaries, so they appear to rely on this session being authenticated.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: achronus (achronus-uk). Use `wandb login --relogin` to force relogin


True

In [7]:
from examples.cross_entropy import train_cartpole

# Train on CartPole using the objects built in the CartPole setup cell.
# NOTE(review): run_idx=4 presumably labels this run -- confirm against
# examples.cross_entropy.train_cartpole.
train_cartpole(env, net, loss, optimizer, config, run_idx=4)

0: loss=0.691, reward_mean=38.0, rw_bound=43.0
1: loss=0.673, reward_mean=27.4, rw_bound=25.5
2: loss=0.674, reward_mean=28.2, rw_bound=32.5
3: loss=0.658, reward_mean=44.3, rw_bound=42.5
4: loss=0.643, reward_mean=37.0, rw_bound=42.5
5: loss=0.633, reward_mean=40.9, rw_bound=55.5
6: loss=0.649, reward_mean=37.2, rw_bound=44.5
7: loss=0.613, reward_mean=37.6, rw_bound=40.0
8: loss=0.627, reward_mean=38.1, rw_bound=45.5
9: loss=0.598, reward_mean=54.6, rw_bound=61.0
10: loss=0.619, reward_mean=51.8, rw_bound=57.0
11: loss=0.623, reward_mean=55.2, rw_bound=61.5
12: loss=0.598, reward_mean=54.2, rw_bound=65.5
13: loss=0.582, reward_mean=69.4, rw_bound=77.5
14: loss=0.596, reward_mean=69.9, rw_bound=73.0
15: loss=0.596, reward_mean=62.8, rw_bound=72.5
16: loss=0.589, reward_mean=67.0, rw_bound=77.5
17: loss=0.568, reward_mean=72.4, rw_bound=85.5
18: loss=0.579, reward_mean=78.8, rw_bound=85.5
19: loss=0.572, reward_mean=61.1, rw_bound=66.5
20: loss=0.552, reward_mean=70.0, rw_bound=72.5
21

0,1
ep_idx,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
loss,█▇▇▇▆▆▆▅▅▄▅▅▄▄▄▄▄▃▄▃▃▂▃▂▃▃▁▂▂▁▂▂▁▁▁▂▁
reward_bound,▂▁▁▂▂▂▂▁▂▂▂▂▂▃▃▃▃▃▃▂▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▆█
reward_mean,▁▁▁▂▁▂▁▁▁▂▂▂▂▃▃▂▃▃▃▂▃▃▃▃▄▄▄▄▄▄▅▅▆▆▆▆█

0,1
ep_idx,36.0
loss,0.52581
reward_bound,232.0
reward_mean,210.125


In [4]:
import numpy as np

class DiscreteObservationsToVector(gym.ObservationWrapper):
    """One-hot encodes a discrete observation space.

    The wrapped environment must expose a ``gym.spaces.Discrete`` observation
    space with ``n`` values. This wrapper advertises a ``Box(0.0, 1.0, (n,),
    float32)`` space instead and turns each discrete observation into a
    length-``n`` vector holding a single ``1.0``.

    Args:
        env: environment whose observation space is ``Discrete``.

    Raises:
        AssertionError: if the observation space is not ``Discrete``.
    """

    def __init__(self, env: gym.Env) -> None:
        super().__init__(env)

        assert isinstance(env.observation_space, gym.spaces.Discrete), (
            "DiscreteObservationsToVector requires a Discrete observation space"
        )

        # A Discrete space covers [start, start + n); remember the offset so
        # observations map onto vector indices 0..n-1 even when start != 0.
        self._start = int(env.observation_space.start)

        shape = (env.observation_space.n,)
        self.observation_space: gym.spaces.Box = gym.spaces.Box(0.0, 1.0, shape, dtype=np.float32)

    def observation(self, observation: int) -> np.ndarray:
        """Return the one-hot float32 vector for a discrete ``observation``."""
        # `low` is all zeros, so a copy of it is a blank one-hot template.
        res = np.copy(self.observation_space.low)
        res[observation - self._start] = 1.0
        return res

    def step(self, action):
        """Step the wrapped env and one-hot encode the returned observation.

        Returns:
            The ``(observation, reward, terminated, truncated, info)`` tuple
            from the wrapped environment, with the observation encoded.
        """
        # Fixes bug with 'NumpyToTorch' wrapper: unwrap a NumPy array action
        # into a plain Python scalar before passing it to the wrapped env.
        action = action.item() if isinstance(action, np.ndarray) else action
        observation, reward, terminated, truncated, info = self.env.step(action)
        return self.observation(observation), reward, terminated, truncated, info

## FrozenLake Example

In [5]:
# FrozenLake setup: load its settings and build the environment, network,
# loss and optimizer for the FrozenLake training cell.
# NOTE(review): this rebinds `config`, `obs_size`, `n_actions`, `loss` and
# `optimizer`, which the CartPole cells above also use; re-running CartPole
# training after this cell would pick up these FrozenLake objects.
config = load_config("config/cp_fl.yaml")
# Discrete observations are one-hot encoded first, then the result is wrapped
# with NumpyToTorch. `is_slippery=False` is forwarded to the environment
# constructor.
env2: gym.Env = NumpyToTorch(DiscreteObservationsToVector(gym.make(config.env.name, is_slippery=False)))

# Input width is the length of the one-hot vector; output width is the number
# of discrete actions.
obs_size = env2.observation_space.shape[0]
n_actions = env2.action_space.n

# Hidden width and the optimizer keyword arguments both come from the config.
net2 = SimpleNet(obs_size, config.model.hidden_size, n_actions)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net2.parameters(), **config.optimizer)

In [6]:
from examples.cross_entropy import train_frozenlake

# Train on FrozenLake; `loss`, `optimizer` and `config` here are the ones
# rebound by the FrozenLake setup cell, not the CartPole ones.
# NOTE(review): run_idx=1 presumably labels this run -- confirm against
# examples.cross_entropy.train_frozenlake.
train_frozenlake(env2, net2, loss, optimizer, config, run_idx=1)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: achronus (achronus-uk). Use `wandb login --relogin` to force relogin


0: loss=1.394, reward_mean=0.010, rw_bound=0.000, batch=1
1: loss=1.341, reward_mean=0.030, rw_bound=0.000, batch=3
2: loss=1.296, reward_mean=0.049, rw_bound=0.000, batch=5
3: loss=1.248, reward_mean=0.086, rw_bound=0.000, batch=9
4: loss=1.216, reward_mean=0.138, rw_bound=0.000, batch=15
5: loss=1.183, reward_mean=0.191, rw_bound=0.000, batch=22
6: loss=1.128, reward_mean=0.270, rw_bound=0.000, batch=33
7: loss=1.028, reward_mean=0.421, rw_bound=0.239, batch=40
8: loss=0.980, reward_mean=0.421, rw_bound=0.314, batch=47
9: loss=0.897, reward_mean=0.490, rw_bound=0.349, batch=53
10: loss=0.753, reward_mean=0.608, rw_bound=0.430, batch=47
11: loss=0.626, reward_mean=0.680, rw_bound=0.478, batch=51
12: loss=0.502, reward_mean=0.675, rw_bound=0.531, batch=46
13: loss=0.440, reward_mean=0.637, rw_bound=0.531, batch=66
14: loss=0.382, reward_mean=0.765, rw_bound=0.531, batch=94
15: loss=0.333, reward_mean=0.825, rw_bound=0.531, batch=135
Solved!


0,1
batch,▁▁▁▁▂▂▃▃▃▄▃▄▃▄▆█
ep_idx,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
loss,██▇▇▇▇▆▆▅▅▄▃▂▂▁▁
reward_bound,▁▁▁▁▁▁▁▄▅▆▇▇████
reward_mean,▁▁▁▂▂▃▃▅▅▅▆▇▇▆▇█

0,1
batch,135.0
ep_idx,15.0
loss,0.33294
reward_bound,0.53144
reward_mean,0.82474
