In [2]:
import gymnasium as gym

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from collections import deque

env = gym.make("Pendulum-v1", render_mode="human")

""" state: Box(x_position, velocity), length=2
    actions: Box(force_on_car), length=1

    -ve: left, +ve: right
"""

def test_run(n_ep, env):
    
    for _ in range(n_ep):
        observation, info = env.reset()
        total_reward = 0

        while True:
            action = env.action_space.sample()
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            
            total_reward += reward
            # print(total_reward)
            
            if done:
                break

# test_run(1, env)
# env.close()

In [11]:
class policy_net(nn.Module):
    def __init__(self, in_dim, out_dim) -> None:
        super().__init__()
        # note to self: nn.Linear() represents the transformation, not the matrices themselves.
        self.il = nn.Linear(in_dim, 50)
        self.relu = nn.ReLU()

        self.mean_l = nn.Linear(50, out_dim)
        self.log_std_l = nn.Linear(50, out_dim)
    def forward(self, x):
        x = F.relu(self.il(x))

        mean = self.mean_l(x)
        log_std = self.log_std_l(x)
        return mean, log_std

def generate_episode(policy_net, _env):
    # we keep the means as means because we need both action prediction and mean of distribution
    states, log_probs, rewards = [], [], []
    # reset env, get the starting state
    state, info = _env.reset()
    # single-goal oriented environment, can track successes, yay
    successes = 0

    while True:
        # add current state to states list
        states.append(state)
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        
        # get the predicted mean of distri for current state, add to list of means
        mean, log_std = policy_net.forward(state)
        std = torch.exp(log_std)
        
        gauss = torch.distributions.Normal(mean, std)
        action = gauss.sample()
        action_np = action.detach().numpy().squeeze(0)
        
        log_prob_tensor = gauss.log_prob(action).unsqueeze(0)
        log_probs.append(log_prob_tensor)

        state, reward, terminated, truncated, info = _env.step(action_np)
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        
        reward = torch.tensor(reward)
        rewards.append(reward)
        
        if terminated or truncated:
            if rewards[-1] == 1:
                successes += 1
            break

    return log_probs, rewards

def REINFORCE(_env, n_ep):
    state_dim = _env.observation_space.shape[0]
    action_dim = _env.action_space.shape[0]
    policy = policy_net(state_dim, action_dim)
    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)
    gamma = 0.99
        
    for _ in range(n_ep):
        log_probs, rewards = generate_episode(policy, _env)
        returns = []
        policy_loss = []
        disc_return = 0
        
        for R in reversed(rewards):
            disc_return = R + gamma * disc_return
            returns.insert(0, disc_return) 
            
        returns = torch.tensor(returns, dtype=torch.float32)
        returns = (returns - returns.mean()) / (returns.std() + 1e-9)
        
        for G, L in zip(returns, log_probs):
            policy_loss.append(-L * G)
        
        returns = torch.tensor(returns)
        policy_loss = torch.cat(policy_loss).sum()
        print(policy_loss)
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
    return policy

In [12]:
env = gym.make("Pendulum-v1")

policy = REINFORCE(env, 1000)
env.close()

  state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x1 and 3x50)

In [7]:
def test_policy_net(policy, _env):
    observation, info = _env.reset()
    observation = torch.tensor(observation)

    while True:
        mean, log_std = policy.forward(observation)
        std = torch.exp(log_std)
        
        gauss = torch.distributions.Normal(mean, std)
        action = gauss.sample()
        
        observation, reward, terminated, truncated, info = _env.step(action)
        observation = torch.tensor(observation)
        
        if terminated or truncated or reward < -100:
            break

env = gym.make("Pendulum-v1", render_mode="human")
test_policy_net(policy, env)
env.close()

In [10]:
env = gym.make("Pendulum-v1", render_mode="human")

states, log_probs, rewards = [], [], []
state, info = env.reset()

state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
states.append(state_tensor)

mean, log_std = policy.forward(state_tensor)
std = torch.exp(log_std)

gauss = torch.distributions.Normal(mean, std)
action_tensor = gauss.sample()

# --- FIX 1: Sum log probabilities for multi-dimensional actions ---
# The total log-prob of an action vector is the sum of the log-probs of its components.
log_prob_tensor = gauss.log_prob(action_tensor).sum(axis=-1, keepdim=True)
log_probs.append(log_prob_tensor)
env.close()
