In [1]:
import sys
sys.path.append("..")
import models
import policy_gradient_methods
import utils
import gymnasium as gym
import torch
import random

# Settings

In [2]:
# Run configuration shared by the cells below.

# Gymnasium environment id handed to gym.make().
ENVIRONMENT_ID = "CartPole-v1"
# NOTE(review): presumably the episode budget for a full training run; the demo
# cell visible in this notebook passes 1 to the trainer instead — confirm usage.
NUM_EPISODES = 10000
# Forwarded to train_episodic_reinforce as `grad_clip_value`.
GRAD_CLIP_VALUE = 100
# Forwarded to train_episodic_reinforce as `gamma` (presumably the discount factor — confirm).
GAMMA = 0.9
# NOTE(review): presumably hidden-layer widths for the "Neural network model"
# sections; not referenced by any code visible here — confirm.
NN_HIDDEN_LAYER_SIZES = [8, 4]
# Single seed reused for torch, `random`, the env action space and env.reset().
RNG_SEED = 7
# Automatic CUDA selection is disabled below; the device is pinned to the CPU.
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print("PyTorch device:", device)
# Seed both PyTorch's and the stdlib's global RNGs up front.
torch.manual_seed(RNG_SEED)
random.seed(RNG_SEED)

PyTorch device: cpu


# REINFORCE

## Linear model

In [3]:
def select_action_softmax(model, state):
    """Sample an action index from the distribution given by ``model(state)``.

    The model output is used as the probability vector of a categorical
    distribution. One sample is drawn from it and returned as a plain Python
    number. No gradients are tracked while the model is evaluated.

    Args:
        model: Callable mapping ``state`` to a tensor of action probabilities.
        state: Input forwarded unchanged to ``model``.

    Returns:
        The sampled action index (result of ``Tensor.item()``).
    """
    with torch.no_grad():
        action_probs = model(state)
        policy = torch.distributions.categorical.Categorical(probs=action_probs)
        sampled_action = policy.sample()
    return sampled_action.item()

def _discounted_returns(rewards, gamma):
    """Return, for every step t, the discounted sum of rewards from t onwards."""
    discounted = []
    running = 0.0
    # Accumulate backwards so each step reuses the return of its successor:
    # G_t = r_t + gamma * G_{t+1}.
    for reward in reversed(rewards):
        running = reward + gamma * running
        discounted.append(running)
    discounted.reverse()
    return discounted


def _reinforce_update(policy_model, optimizer, states, actions, rewards, gamma, device, grad_clip_value=None):
    """Apply one REINFORCE gradient step computed from a single finished episode."""
    step_returns = _discounted_returns(rewards, gamma)
    loss = torch.zeros((), device=device)
    # `states` holds one more entry than `actions` (the final state); zip drops it.
    for t, (state, action, step_return) in enumerate(zip(states, actions, step_returns)):
        # States are evaluated one at a time because the policy may normalise
        # over dim 0 of an unbatched state vector.
        action_probs = policy_model(state)
        log_prob = torch.distributions.categorical.Categorical(probs=action_probs).log_prob(
            torch.tensor(action, device=device)
        )
        # Minimising -gamma^t * G_t * log pi(a_t | s_t) performs gradient ascent
        # on the episodic REINFORCE objective.
        loss = loss - (gamma ** t) * step_return * log_prob

    optimizer.zero_grad()
    loss.backward()
    if grad_clip_value is not None:
        torch.nn.utils.clip_grad_value_(policy_model.parameters(), grad_clip_value)
    optimizer.step()


def train_episodic_reinforce(env, policy_model, loss_func, optimizer, device, rng_seed, num_episodes, gamma, grad_clip_value=None):
    """Run episodes with a stochastic policy and update it with REINFORCE.

    Each episode is generated by sampling actions with ``select_action_softmax``.
    After the episode ends, one policy-gradient step is taken over the whole
    trajectory.

    Args:
        env: Environment exposing ``reset(seed=...)`` and ``step(action)`` with
            the five-value step result used below.
        policy_model: Callable mapping a state tensor to action probabilities;
            its ``parameters()`` are used for gradient clipping.
        loss_func: Accepted for interface compatibility; not used, because the
            policy-gradient loss is built directly from log-probabilities.
        optimizer: Optimizer over the policy parameters. If ``None``, episodes
            are only generated and no update is performed.
        device: Device on which state tensors are created.
        rng_seed: Seed passed to ``env.reset`` for the first episode only.
        num_episodes: Number of episodes to run; ``0`` is allowed.
        gamma: Discount factor used for the per-step returns.
        grad_clip_value: If not ``None``, gradients are clipped element-wise to
            ``[-grad_clip_value, grad_clip_value]`` before the optimizer step.

    Returns:
        Tuple ``(returns, states, actions, rewards)``. ``returns`` holds the
        undiscounted return of every episode. The other three describe the
        LAST episode only and are empty lists when ``num_episodes`` is 0.
    """
    returns = []
    # Bound up front so that num_episodes == 0 returns empty trajectories
    # instead of raising UnboundLocalError at the return statement.
    states, actions, rewards = [], [], []
    for episode in range(num_episodes):
        # Initiate episode
        # Set seed only one time per training run. For more info, see https://gymnasium.farama.org/api/env/.
        seed = rng_seed if episode == 0 else None
        observation, info = env.reset(seed=seed)
        state = torch.tensor(observation, device=device)
        truncated = False
        terminated = False
        G = 0
        states = [state]
        actions = []
        rewards = []

        # Generate episode
        while not (terminated or truncated):
            action = select_action_softmax(policy_model, state)
            observation, reward, terminated, truncated, info = env.step(action)
            next_state = torch.tensor(observation, device=device)
            states.append(next_state)
            actions.append(action)
            rewards.append(reward)
            G += reward
            state = next_state
        returns.append(G)

        # Learn from episode
        if optimizer is not None:
            _reinforce_update(
                policy_model, optimizer, states, actions, rewards, gamma, device, grad_clip_value
            )

    return returns, states, actions, rewards

In [5]:
# Smoke test: build a linear softmax policy, sample one action, then run a
# single episode through train_episodic_reinforce and inspect the trajectory.
env = gym.make(ENVIRONMENT_ID)
try:
    env.action_space.seed(RNG_SEED)
    # One linear layer from observation features to one output per action.
    # Softmax(dim=0) normalises over the only dimension of the unbatched state.
    model = torch.nn.Sequential(
        torch.nn.Linear(env.observation_space.shape[0], env.action_space.n),
        torch.nn.Softmax(dim=0),
    ).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    # Forwarded to the trainer only to satisfy its signature.
    loss_func = torch.nn.MSELoss()

    # Check the initial policy output and one sampled action before training.
    observation, info = env.reset(seed=RNG_SEED)
    state = torch.tensor(observation, device=device)
    action = select_action_softmax(model, state)
    with torch.no_grad():
        print(model(state))
    print(action)

    # Run exactly one episode; the trainer re-seeds the env on its first reset.
    returns, states, actions, rewards = train_episodic_reinforce(
        env, model, loss_func, optimizer, device, RNG_SEED, 1, GAMMA, GRAD_CLIP_VALUE
    )
    print("returns:", returns)
    print("states:", states)
    print(len(states))
    print("actions:", actions)
    print(len(actions))
    print("rewards:", rewards)
    print(len(rewards))
finally:
    # Release the environment even if policy construction or the rollout fails.
    env.close()
# TODO(review): plotting is disabled for this one-episode check; re-enable when
# training for more episodes.
#utils.plot_returns(returns)

tensor([0.6210, 0.3790])
0
returns: [15.0]
states: [tensor([ 0.0125,  0.0397,  0.0276, -0.0275]), tensor([ 0.0133, -0.1558,  0.0270,  0.2738]), tensor([ 0.0102, -0.3513,  0.0325,  0.5749]), tensor([ 0.0032, -0.1566,  0.0440,  0.2926]), tensor([3.0043e-05, 3.7838e-02, 4.9843e-02, 1.4091e-02]), tensor([ 0.0008, -0.1580,  0.0501,  0.3221]), tensor([-0.0024, -0.3538,  0.0566,  0.6301]), tensor([-0.0094, -0.5496,  0.0692,  0.9401]), tensor([-0.0204, -0.3555,  0.0880,  0.6699]), tensor([-0.0276, -0.5517,  0.1014,  0.9889]), tensor([-0.0386, -0.3581,  0.1211,  0.7297]), tensor([-0.0457, -0.5547,  0.1357,  1.0580]), tensor([-0.0568, -0.7513,  0.1569,  1.3900]), tensor([-0.0719, -0.5584,  0.1847,  1.1502]), tensor([-0.0830, -0.7554,  0.2077,  1.4946]), tensor([-0.0981, -0.5634,  0.2376,  1.2733])]
16
actions: [0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1]
15
rewards: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
15


## Neural network model

# REINFORCE with baseline

## Linear model

## Neural network model